# This module compiles a regular expression that recognizes Python tokens.
# It is designed to match the workings of the Python tokenizer exactly.
# It takes care of everything except indentation;
# note that un-escaped newlines are tokens, too.
# tokenprog.regs[3] gives the location of the token without the leading
# whitespace.
# It also defines various subexpressions, but doesn't compile them.
# See the function test() below for an example of how to use it.
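#
# A minimal usage sketch (illustrative only; line is assumed to hold one
# line of source text, and tokenprog is compiled further down):
#	j = tokenprog.match(line, 0)	# number of characters matched, or -1
#	a, b = tokenprog.regs[3]	# bounds of the token, whitespace excluded
#	token = line[a:b]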

import regex

# Note: to get a quoted backslash in a regexp, it must be quadrupled.

Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?'
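# For instance, the quadrupled backslash in Ignore becomes two backslashes in
# the pattern string, which the regex engine reads as a single literal
# backslash; the group \(\\\\\n[ \t]*\)* therefore matches an escaped newline
# (a line continuation) plus any indentation that follows it.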

Name = '[a-zA-Z_][a-zA-Z0-9_]*'

Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = Hexnumber + '\|' + Octnumber + '\|' + Decnumber
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(' + Exponent + '\)?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = Pointfloat + '\|' + Expfloat
Number = Floatnumber + '\|' + Intnumber
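# For illustration, Number as a whole matches literals such as 0xFF, 017,
# 42, 42L, 3.14, .5 and 1e10 (but not a leading sign, which is an operator).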

String = '\'\(\\\\.\|[^\\\n\']\)*\'' + '\|' + '"\(\\\\.\|[^\\\n"]\)*"'
# Note: this module *recognizes* double quotes, but for backward
# compatibility, it doesn't *use* them!
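# For illustration, String matches 'spam', "eggs" and 'don\'t'; since the
# character classes exclude \n, a string literal must fit on one line.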

Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>'
Bracket = '[][(){}]'
Special = '[:;.,`\n]'
Funny = Operator + '\|' + Bracket + '\|' + Special

PlainToken = Name + '\|' + Number + '\|' + String + '\|' + Funny

Token = Ignore + '\(' + PlainToken + '\)'
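# Group numbering: Ignore opens groups 1 (escaped newline) and 2 (comment),
# so the \(...\) around PlainToken is group 3; that is why regs[3] holds the
# token proper, with the leading whitespace stripped off.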

save_syntax = regex.set_syntax(0)	# Use default syntax
try:
	tokenprog = regex.compile(Token)
finally:
	if save_syntax != 0:
		dummy = regex.set_syntax(save_syntax)	# Restore original syntax


def test(file):
	f = open(file, 'r')
	while 1:
		line = f.readline()
		if not line: break
		i, n = 0, len(line)
		while i < n:
			j = tokenprog.match(line, i)
			if j < 0:
				print 'No token at', `line[i:i+20]` + '...'
				i = i+1
			else:
				i = i+j
				a, b = tokenprog.regs[3]
				if a < b:
					print 'Token:', `line[a:b]`
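

# A minimal command-line driver, added here as a sketch: when this module is
# run as a script, tokenize the file named by the first argument.
if __name__ == '__main__':
	import sys
	if len(sys.argv) > 1:
		test(sys.argv[1])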